# -*- coding:utf8 -*-
# !/usr/bin/env python

'''
Crawler for the National Enterprise Credit Information Publicity System (Chongqing).
'''

import requests
import urllib2
import urllib
from lxml import etree
import cookielib
import json
import re
import base64
# from urllib2 import Request, urlopen, URLError, HTTPError
from scpy.logger import get_logger

from crawler.utils import kill_captcha

logger = get_logger(__file__)


#
class SearchCq(object):
    """Look up a company on gsxt.cqgs.gov.cn and collect its public records.

    One lookup (see getCqCompanyInfo) performs these steps:

    1. download a captcha image and have it solved by ``kill_captcha``;
    2. POST the keyword and the captcha answer to the search page and read
       the identifiers of the first result;
    3. ask the site for the normalised company type;
    4. fetch the industry-and-commerce detail record and every annual report.

    All requests go through the process-wide urllib2 opener that
    __downloadImgCqOne installs, so the cookies obtained together with the
    captcha are sent with the later requests.
    """

    def __init__(self):
        # Records gathered by the most recent getCqCompanyInfo() call.
        self.company_dict = {
            'IcPublic': [],
            'allAnnualReports': [],
        }

        # Identifiers of the company found by the most recent search.
        self.companyIDs = {
            'companyID_str': '',
            'data_entid': '',
            'companyName_str': '',
        }

        # Address the captcha image is downloaded from.
        self.img_urlStr = r'http://gsxt.cqgs.gov.cn/sc.action?width=130&height=40&fs=23'

        # City code passed to kill_captcha.
        self.sourceStr = 'cq'
        # Captcha image format passed to kill_captcha.
        self.imgformat = 'png'

        # Address of the captcha recognition service (not referenced by the
        # methods of this class; kill_captcha is used instead).
        self.req_urlStr = r'http://192.168.31.121:44444/captcha'

        # Search page; the company id, entId, name and type are read from
        # its response.
        self.cqSearchPage_urlStr = r'http://gsxt.cqgs.gov.cn/search.action'

        # Page whose markup carries the company type used by later requests.
        self.get_company_type_url = r'http://gsxt.cqgs.gov.cn/search_ent'

        # GET endpoint returning the company detail record; the query string
        # (id, type, entId) is appended to it.
        self.cqGetCompanyDetailInfo_urlStr = r'http://gsxt.cqgs.gov.cn/search_getEnt.action?'

        # Related endpoints used by __annualReports:
        #   list of report years:
        #     http://gsxt.cqgs.gov.cn/search_getYearReport.action?id=<id>&type=<type>
        #   report of one year:
        #     http://gsxt.cqgs.gov.cn/search_getYearReportDetail.action?id=<id>&type=<type>&year=<year>

    def __readUrl(self, url, data=None):
        """Fetch ``url`` with the installed urllib2 opener and return the body.

        :param url: address to request
        :param data: url-encoded POST body, or None for a GET request
        :return: raw response body
        """
        response = urllib2.urlopen(url, data)
        try:
            return response.read()
        finally:
            # Release the connection even when read() fails.
            response.close()

    # Hook for future keyword segmentation / fuzzy search.
    def __getUserKeyword(self, key):
        """Remember the search keyword on the instance and return it."""
        self.key = key
        return self.key

    def __downloadImgCqOne(self):
        """Download the captcha image once.

        A fresh cookie-aware opener is built and installed as the global
        urllib2 opener, so later ``urllib2.urlopen`` calls share its cookies.

        :return: raw bytes of the captcha image
        :raises: whatever urllib2 raises; the error is logged first
        """
        cookiejar = cookielib.CookieJar()
        urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
        urllib2.install_opener(urlopener)

        # Request headers sent with every request made through this opener.
        urlopener.addheaders.append(('Host', 'gsxt.cqgs.gov.cn/'))
        urlopener.addheaders.append(
            ('User-Agent', 'Mozilla/5.0 (X11;Ubuntu;Linux x86_64;rv:40.0) Gecko/20100101 Firefox/40.0'))
        urlopener.addheaders.append(('Accept-Language', 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'))
        urlopener.addheaders.append(('Referer', 'http://gsxt.cqgs.gov.cn/'))
        urlopener.addheaders.append(('Connection', 'Keep-Alive'))

        try:
            response = urlopener.open(urllib2.Request(self.img_urlStr))
            try:
                return response.read()
            finally:
                response.close()
        except Exception as e:
            # HTTPError and URLError are Exception subclasses, so one handler
            # covers them; a bare raise keeps the original traceback.
            logger.error(e)
            raise

    def __downloadImgCq(self):
        """Download the captcha, retrying while the response body is empty.

        Network errors are not retried; they propagate from
        __downloadImgCqOne.

        :return: raw bytes of the captcha image
        :raises ValueError: when every attempt returned an empty body
        """
        MaxTimes = 10
        remaining = MaxTimes
        image = self.__downloadImgCqOne()

        while not image and remaining > 0:
            remaining -= 1
            # Report the number of retries that are really left.
            logger.error("多次下载验证码错误,剩余次数为 %s：" % remaining)
            image = self.__downloadImgCqOne()

        if not image:
            logger.error("多次下载验证码错误,当前设定次数为 %s" % MaxTimes)
            raise ValueError("多次下载验证码错误,当前设定次数为 %s" % MaxTimes)
        return image

    def __crackImgCq(self, imgBin):
        """Send the captcha image to the recognition helper.

        :param imgBin: raw bytes of the captcha image
        :return: the recognised text, or None when the image is empty or
                 nothing was recognised (the text may still be wrong)
        """
        if not imgBin:
            logger.error("image downloaded is void !")
            return None

        res_code = kill_captcha(imgBin, self.sourceStr, self.imgformat)
        if res_code:
            return res_code

        # No exception is active here, so error() is used, not exception().
        logger.error("破解回来的验证码为空")
        return None

    def __getCompany(self, res_code):
        """Search for ``self.key`` using a solved captcha.

        Only the first company of the result list is used.

        :param res_code: captcha answer returned by __crackImgCq
        :return: dict with keys companyID_str, data_entid, companyName_str
                 and companyType_str on success; '' when the site reports no
                 result; None when the captcha is empty or rejected, the
                 request fails, or the page cannot be parsed
        """
        if not res_code:
            # No exception is active here, so error() is used, not exception().
            logger.error("验证码为空")
            return None

        form = urllib.urlencode({
            'key': self.key,
            'code': res_code,
        })
        try:
            cqSearchPage_Html = self.__readUrl(self.cqSearchPage_urlStr, form)
        except Exception as e:
            logger.error(e)
            return None

        try:
            tree = etree.HTML(cqSearchPage_Html)

            # The site answers with this exact text when the captcha is wrong.
            if tree.xpath('//*[@id="wrap"]/div[1]/text()') == [
                    u'\r\n\t\t \t\t\u9a8c\u8bc1\u7801\u4e0d\u6b63\u786e\r\n\t\t\t\t']:
                logger.info("验证码验证失败,当前验证码为：%s" % res_code)
                return None

            # ... and with this exact text when nothing matches the keyword.
            if tree.xpath('//*[@id="tip"]/text()') == [
                    u'>> \u60a8\u641c\u7d22\u7684\u6761\u4ef6\u65e0\u67e5\u8be2\u7ed3\u679c   <<']:
                logger.info("没有查询到公司，请更换关键字！")
                return ''

            # Captcha accepted: read the attributes of the result links.
            link = '//*[@id="result"]/div/a'
            data_entid = tree.xpath(link + '/@data-entid')
            companyID_str = tree.xpath(link + '/@data-id')
            companyName_str = tree.xpath(link + '/text()')
            companyType_str = tree.xpath(link + '/@data-type')
            logger.info("网站返回的公司类型：%s" % companyType_str[0])

            # An empty result list raises IndexError here, which the handler
            # below turns into None.
            return {
                'companyID_str': companyID_str[0],
                'data_entid': data_entid[0],
                'companyName_str': companyName_str[0],
                'companyType_str': companyType_str[0],
            }
        except Exception as e:
            logger.exception(e)
            return None

    def __get_company_type(self, companyIDs):
        """Replace the search-page company type with the one the site uses.

        The type is read from the digits following ``type=`` in the
        ``ng-init`` attribute of the page returned by get_company_type_url.

        :param companyIDs: dict returned by __getCompany; updated in place
        :return: the same dict with 'companyType_str' replaced, or None when
                 the input is empty, the request fails, or the type cannot
                 be found in the page
        """
        if not companyIDs:
            return None

        form = urllib.urlencode({
            'id': companyIDs['companyID_str'],
            'type': companyIDs['companyType_str'],
            'name': companyIDs['companyName_str'],
            'entId': companyIDs['data_entid'],
        })
        try:
            get_company_type_html = self.__readUrl(self.get_company_type_url, form)
        except Exception as e:
            logger.error(e)
            return None

        # Same patterns as before; search() yields the first match, exactly
        # what findall(...)[0] returned, without raising when there is none.
        init_match = re.search(r'.*ng-init=.*type=(.*);name=.*', get_company_type_html)
        type_match = re.search(r'\d+', init_match.group(1)) if init_match else None
        if type_match is None:
            logger.error("company type not found in the search_ent page")
            return None

        companyIDs['companyType_str'] = type_match.group(0)
        return companyIDs

    def __getCompanyIcPost(self, companyIDs):
        """Fetch the industry-and-commerce detail record of a company.

        :param companyIDs: dict with companyID_str, companyType_str and
                           data_entid
        :return: the decoded JSON record, or '' when companyIDs is empty
        :raises: any error raised while requesting or decoding the record
                 (logged first)
        """
        if not companyIDs:
            logger.error("获取公司的注册号失败！")
            return ''

        query = urllib.urlencode({
            'id': companyIDs['companyID_str'],
            'type': companyIDs['companyType_str'],
            'entId': companyIDs['data_entid'],
        })
        try:
            raw = self.__readUrl(self.cqGetCompanyDetailInfo_urlStr + query)
            # The first six characters are dropped before decoding --
            # presumably a non-JSON prefix sent by the site; TODO confirm.
            return json.loads(raw[6:])
        except Exception as e:
            logger.exception(e)
            raise

    def __annualReports(self, companyIDs):
        """Fetch every annual report of a company.

        :param companyIDs: dict with companyID_str and companyType_str
        :return: list with one decoded report per year (each given a 'year'
                 key), [] when the company has no reports, None when
                 companyIDs is empty
        :raises: any error raised while requesting or decoding a report
                 (logged first)
        """
        if not companyIDs:
            logger.error('Input companyIDs dict Error!')
            return None

        companyID_str = companyIDs['companyID_str']
        companyType_str = companyIDs['companyType_str']

        # Step 1: the list of years for which a report exists.
        years_url = 'http://gsxt.cqgs.gov.cn/search_getYearReport.action?id=' + companyID_str + \
                    '&type=' + companyType_str
        try:
            # Six leading characters are dropped, as for the detail record.
            annualReports_dict = json.loads(self.__readUrl(years_url)[6:])
        except Exception as e:
            logger.error("获取年报失败")
            logger.exception(e)
            raise

        if not ('history' in annualReports_dict and annualReports_dict['history']):
            logger.info("该公司没有年报信息")
            return []

        # Step 2: the detail of each yearly report.
        year_list = []
        try:
            for ayear_dict in annualReports_dict['history']:
                year_str = str(ayear_dict['year'])
                detail_url = 'http://gsxt.cqgs.gov.cn/search_getYearReportDetail.action?id=' + \
                             companyID_str + '&type=' + companyType_str + '&year=' + year_str

                # NOTE(review): unlike the two endpoints above, this body is
                # decoded without dropping a prefix -- confirm it is intended.
                ayearReport = json.loads(self.__readUrl(detail_url))
                ayearReport["year"] = year_str
                year_list.append(ayearReport)
            return year_list
        except Exception as e:
            logger.exception(e)
            logger.info("获取年报失败")
            raise

    # Public entry point.
    def getCqCompanyInfo(self, key):
        """Search for a company and return its public records.

        :param key: company name to search for
        :return: dict ``{"allAnnualReports": [...], "IcPublic": [record]}``
                 on success; '' when the site finds no company; None when
                 the detail record or the annual reports cannot be obtained
        :raises Exception: when the captcha is still rejected after all
                 retries; errors from the download and fetch helpers also
                 propagate
        """
        # Number of captcha rounds to attempt.
        MAX_TIME = 20
        self.__redotimes = MAX_TIME
        temID = None

        # Start from a clean slate so that a reused instance never returns
        # records collected for a previously searched company.
        self.company_dict = {
            'IcPublic': [],
            'allAnnualReports': [],
        }

        self.__getUserKeyword(key)

        # Solve captchas until the search is accepted or the budget is spent.
        while self.__redotimes > 0:
            imgBin = self.__downloadImgCq()
            res_code = self.__crackImgCq(imgBin)
            temID = self.__getCompany(res_code)

            if temID is not None:
                # Either a company dict or '' (no match): in both cases the
                # captcha itself was accepted.
                logger.info("验证码破解成功!,验证码为:%s" % res_code)
                break

            self.__redotimes -= 1
            logger.error("验证码破解失败，再次破解，剩余次数为： %s" % self.__redotimes)

        if temID is None:
            logger.error("验证码破解失败，当前设置的重复破解次数为： %s" % MAX_TIME)
            raise Exception("验证码破解失败，当前设置的重复破解次数为： %s" % MAX_TIME)
        if temID == '':
            # The search ran but matched no company.
            return ''

        self.companyIDs = temID
        self.companyIDs = self.__get_company_type(self.companyIDs)

        companyIc = self.__getCompanyIcPost(self.companyIDs)
        if not companyIc:
            logger.error("获取工商信息失败！")
            return None
        self.company_dict['IcPublic'].append(companyIc)

        allAnnualReports_list = self.__annualReports(self.companyIDs)
        if allAnnualReports_list is None:
            # None signals a failure, as opposed to [] for "no reports".
            return None
        self.company_dict['allAnnualReports'].append(allAnnualReports_list)

        # Flatten the annual reports into the returned structure.
        result_dict = {}
        result_dict["allAnnualReports"] = self.company_dict["allAnnualReports"][0]
        result_dict["IcPublic"] = self.company_dict["IcPublic"]
        return result_dict


if __name__ == "__main__":
    # Manual smoke run: look up one sample company and dump the result as
    # indented, ASCII-escaped JSON.
    searcher = SearchCq()
    company_info = searcher.getCqCompanyInfo('重庆轴承工业公司汽车轴承厂')
    print(json.dumps(company_info, ensure_ascii=True, indent=4))
